Using data I generated at several difficulty levels, I want to use machine learning to find the model that fits each dataset best.
Visualization of the data after dimensionality reduction
Performance of the different machine-learning models
The data come from my Shiny apps.
Below are the packages I used.
library(APLM)
library(magrittr)
library(purrr)
library(ggplot2)
library(gridExtra)
library(mvtnorm)
library(MASS)
library(umap)
library(Rtsne)
library(nnet)
library(DT)
library(corrplot)
library(RColorBrewer)
library(rpart)
library(rpart.plot)
library(GGally)
library(randomForest)
library(xgboost)
library(knitr)
A random seed is set when generating the data so the results are reproducible.
set.seed(611011106)
Before fitting any models, I split each dataset into folds for K-fold cross-validation (K = 5).
easytag <- kfcv(data=easy_data, seed=611011106, k=5)
midmtag <- kfcv(data=midm_data, seed=611011106, k=5)
midptag <- kfcv(data=midp_data, seed=611011106, k=5)
hardtag <- kfcv(data=hard_data, seed=611011106, k=5)
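kfcv comes from the APLM package, so its exact implementation is not reproduced here; as a rough, hypothetical sketch (kfcv_sketch is my own name, not the APLM API), a function that returns k lists of test-set row indices could look like this:
kfcv_sketch <- function(data, seed, k = 5){
  set.seed(seed)
  idx <- sample(nrow(data))                           # shuffle the row indices
  split(idx, cut(seq_along(idx), k, labels = FALSE))  # k roughly equal folds of test indices
}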
success <- 0
lda_tb <- list()
for(i in 1:5){
train <- geasy_data[-easytag[[i]],]
test <- geasy_data[easytag[[i]],]
plda <- lda(group~., train)
p <- predict(plda, test[,-1])$class
r <- group[easytag[[i]]]
lda_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## [[1]]
## r
## p 1 2 3
## 1 20 0 0
## 2 0 11 0
## 3 0 0 29
##
## [[2]]
## r
## p 1 2 3
## 1 20 0 0
## 2 0 20 0
## 3 0 0 20
##
## [[3]]
## r
## p 1 2 3
## 1 18 0 0
## 2 1 24 0
## 3 0 0 17
##
## [[4]]
## r
## p 1 2 3
## 1 23 0 0
## 2 0 24 0
## 3 0 0 13
##
## [[5]]
## r
## p 1 2 3
## 1 17 1 0
## 2 1 20 0
## 3 0 0 21
## 99 %
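The accuracy printed above is presumably the accumulated success count divided by the total number of observations; a minimal sketch, assuming geasy_data holds all 300 rows:
paste0(round(success / nrow(geasy_data) * 100, 2), " %")  # e.g. 297/300 = 99 %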
success <- 0
lg_tb <- list()
for(i in 1:5){
train <- geasy_data[-easytag[[i]],]
test <- geasy_data[easytag[[i]],]
lg <- multinom(group~., train)
p<- predict(lg, test[,-1])
r <- group[easytag[[i]]]
lg_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## Call:
## multinom(formula = group ~ ., data = train)
##
## Coefficients:
## (Intercept) X1 X2 X3 X4
## 2 -76.06613 -0.9971579 15.190423 1.797816 2.108939
## 3 -77.17384 8.0086608 -1.785702 5.205221 5.976934
##
## Std. Errors:
## (Intercept) X1 X2 X3 X4
## 2 159.9742 16.28883 49.90668 44.65194 16.27796
## 3 2405.1956 139.43649 377.00929 296.00723 447.05320
##
## Residual Deviance: 0.01058902
## AIC: 20.01059
## [[1]]
## r
## p 1 2 3
## 1 20 0 0
## 2 0 11 1
## 3 0 0 28
##
## [[2]]
## r
## p 1 2 3
## 1 20 0 0
## 2 0 20 0
## 3 0 0 20
##
## [[3]]
## r
## p 1 2 3
## 1 18 1 0
## 2 1 23 0
## 3 0 0 17
##
## [[4]]
## r
## p 1 2 3
## 1 22 0 0
## 2 0 24 0
## 3 1 0 13
##
## [[5]]
## r
## p 1 2 3
## 1 17 1 0
## 2 1 20 0
## 3 0 0 21
## 98 %
Below is the cumulative proportion of variance explained by the principal components.
PC1 alone explains only 58.28% of the variance.
Adding PC2 raises this to 80.39%.
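These cumulative proportions can be read from the prcomp summary; a minimal sketch on the full easy dataset (the report presumably computed them per training fold):
Pca_all <- prcomp(geasy_data[,-1], center = TRUE, scale. = TRUE)
summary(Pca_all)$importance["Cumulative Proportion", ]  # cumulative variance explained by PC1, PC2, ...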
success <- 0
lgpc_tb <- list()
for(i in 1:5){
train <- geasy_data[-easytag[[i]], ]
test <- geasy_data[easytag[[i]], ]
Pca <- prcomp(train[,-1], center=TRUE, scale.=TRUE)
ptrain <- Pca[["x"]]
ptrain <- data.frame(geasy_data$group[-easytag[[i]]], ptrain)
colnames(ptrain)[1] <- "group"
lg <- multinom(group~PC1+PC2, data = ptrain)
ptest <- predict(Pca, test[,-1])
p <- predict(lg, ptest)
r <- geasy_data$group[easytag[[i]]]
lgpc_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## Call:
## multinom(formula = group ~ PC1 + PC2, data = ptrain)
##
## Coefficients:
## (Intercept) PC1 PC2
## 2 -20.97123 -10.78459 63.6664890
## 3 -13.86168 35.26689 0.6224278
##
## Std. Errors:
## (Intercept) PC1 PC2
## 2 235.7793 160.3935 459.7122
## 3 693.9337 1497.9336 3055.8864
##
## Residual Deviance: 0.0004086614
## AIC: 12.00041
## [[1]]
## r
## p 1 2 3
## 1 20 0 0
## 2 0 11 0
## 3 0 0 29
##
## [[2]]
## r
## p 1 2 3
## 1 20 0 0
## 2 0 20 0
## 3 0 0 20
##
## [[3]]
## r
## p 1 2 3
## 1 19 0 0
## 2 0 24 0
## 3 0 0 17
##
## [[4]]
## r
## p 1 2 3
## 1 23 1 0
## 2 0 23 0
## 3 0 0 13
##
## [[5]]
## r
## p 1 2 3
## 1 17 0 0
## 2 1 21 0
## 3 0 0 21
## 99.33 %
success <- 0
rt_tb <- list()
for(i in 1:5){
train <- geasy_data[-easytag[[i]],]
test <- geasy_data[easytag[[i]],]
rt <- rpart(group ~ ., data = train)
p <- predict(rt, test) %>% round()
pp <- c()
for(j in 1:dim(p)[1]){
pp[j] <- which(p[j,]==max(p[j,]))
}
p <- pp
r <- group[easytag[[i]]]
rt_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## [[1]]
## r
## p 1 2 3
## 1 20 0 0
## 2 0 11 0
## 3 0 0 29
##
## [[2]]
## r
## p 1 2 3
## 1 19 0 0
## 2 0 20 0
## 3 1 0 20
##
## [[3]]
## r
## p 1 2 3
## 1 18 0 1
## 2 1 24 0
## 3 0 0 16
##
## [[4]]
## r
## p 1 2 3
## 1 23 0 3
## 2 0 24 0
## 3 0 0 10
##
## [[5]]
## r
## p 1 2 3
## 1 17 1 0
## 2 1 20 0
## 3 0 0 21
## 97.33 %
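Side note: assuming group is a factor in train, rpart can also return class labels directly, which avoids the manual rounding step in the loop above:
rt <- rpart(group ~ ., data = train, method = "class")
p <- predict(rt, test, type = "class")  # predicted class labels directly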
success <- 0
rf_tb <- list()
for(i in 1:5){
train <- geasy_data[-easytag[[i]],]
test <- geasy_data[easytag[[i]],]
train$group <- train$group %>% as.factor()
rf <- randomForest(group~., train)
p <- predict(rf, test)
r <- group[easytag[[i]]]
rf_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## [[1]]
## r
## p 1 2 3
## 1 20 0 0
## 2 0 11 0
## 3 0 0 29
##
## [[2]]
## r
## p 1 2 3
## 1 20 0 0
## 2 0 20 0
## 3 0 0 20
##
## [[3]]
## r
## p 1 2 3
## 1 19 0 0
## 2 0 24 0
## 3 0 0 17
##
## [[4]]
## r
## p 1 2 3
## 1 23 0 0
## 2 0 24 0
## 3 0 0 13
##
## [[5]]
## r
## p 1 2 3
## 1 17 1 0
## 2 1 20 0
## 3 0 0 21
## 99.33 %
success <- 0
rf_tb <- list()
for(i in 1:5){
train <- geasy_data[-easytag[[i]],]
test <- geasy_data[easytag[[i]],]
train$group <- train$group %>% as.factor()
rf <- randomForest(group~., train, ntree=50, mtry=1)
p <- predict(rf, test)
r <- group[easytag[[i]]]
rf_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## [[1]]
## r
## p 1 2 3
## 1 20 0 0
## 2 0 11 0
## 3 0 0 29
##
## [[2]]
## r
## p 1 2 3
## 1 20 0 0
## 2 0 20 0
## 3 0 0 20
##
## [[3]]
## r
## p 1 2 3
## 1 19 0 0
## 2 0 24 0
## 3 0 0 17
##
## [[4]]
## r
## p 1 2 3
## 1 23 0 0
## 2 0 24 0
## 3 0 0 13
##
## [[5]]
## r
## p 1 2 3
## 1 17 1 0
## 2 1 20 0
## 3 0 0 21
## 99.33 %
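(The Gini importance below is presumably printed with rf$importance, as done explicitly in the later sections.)
rf$importance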
## MeanDecreaseGini
## X1 47.54109
## X2 65.85372
## X3 20.41197
## X4 25.47672
geasy_data$group <- as.integer(geasy_data$group)-1
success <- 0
xgb_tb <- list()
for(i in 1:5){
train <- geasy_data[-easytag[[i]],]
test <- geasy_data[easytag[[i]],]
dtrain <- xgb.DMatrix(data = as.matrix(train[,-1]),
label = train$group)
dtest <- xgb.DMatrix(data = as.matrix(test[,-1]),
label = test$group)
xgb.model <- xgboost(data = dtrain,
objective='multi:softmax',
num_class=3,
print_every_n = 100,
max_depth = 1,
eta = 0.3,
gamma = 0,
subsample = 1,
colsample_bytree = 1,
min_child_weight = 12,
nrounds = 30)
# To plot all the decision trees inside the xgboost model, the function below can be used (there are many, so the plot is skipped here)
# xgb.plot.tree(model = xgb.model)
# Prediction
p <- predict(xgb.model, dtest)+1
r <- geasy_data$group[easytag[[i]]]+1
xgb_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## [[1]]
## r
## p 1 2 3
## 1 20 0 0
## 2 0 11 0
## 3 0 0 29
##
## [[2]]
## r
## p 1 2 3
## 1 20 0 0
## 2 0 20 0
## 3 0 0 20
##
## [[3]]
## r
## p 1 2 3
## 1 19 0 0
## 2 0 24 0
## 3 0 0 17
##
## [[4]]
## r
## p 1 2 3
## 1 23 0 1
## 2 0 24 0
## 3 0 0 12
##
## [[5]]
## r
## p 1 2 3
## 1 17 1 0
## 2 1 20 0
## 3 0 0 21
## 99 %
success <- 0
lda_tb <- list()
for(i in 1:5){
train <- gmidm_data[-midmtag[[i]],]
test <- gmidm_data[midmtag[[i]],]
plda <- lda(group~., train)
p<- predict(plda, test[,-1])$class
r <- group[midmtag[[i]]]
lda_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## [[1]]
## r
## p 1 2 3
## 1 16 2 0
## 2 3 8 0
## 3 1 1 29
##
## [[2]]
## r
## p 1 2 3
## 1 20 1 0
## 2 0 18 1
## 3 0 1 19
##
## [[3]]
## r
## p 1 2 3
## 1 16 0 1
## 2 2 22 1
## 3 1 2 15
##
## [[4]]
## r
## p 1 2 3
## 1 17 0 0
## 2 6 22 0
## 3 0 2 13
##
## [[5]]
## r
## p 1 2 3
## 1 18 3 0
## 2 0 17 1
## 3 0 1 20
## 90 %
success <- 0
lg_tb <- list()
for(i in 1:5){
train <- gmidm_data[-midmtag[[i]],]
test <- gmidm_data[midmtag[[i]],]
lg <- multinom(group~., train)
p<- predict(lg, test[,-1])
r <- group[midmtag[[i]]]
lg_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## Call:
## multinom(formula = group ~ ., data = train)
##
## Coefficients:
## (Intercept) X1 X2 X3 X4
## 2 -7.761984 -1.553083 1.767898 0.256090 0.2545674
## 3 -16.005019 1.597211 2.682378 -1.255145 1.3873693
##
## Std. Errors:
## (Intercept) X1 X2 X3 X4
## 2 1.847236 0.3093902 0.3173634 0.2786558 0.2539864
## 3 3.350834 0.5032890 0.4730883 0.4704229 0.3918130
##
## Residual Deviance: 134.3401
## AIC: 154.3401
## [[1]]
## r
## p 1 2 3
## 1 16 2 0
## 2 3 8 1
## 3 1 1 28
##
## [[2]]
## r
## p 1 2 3
## 1 20 1 1
## 2 0 18 1
## 3 0 1 18
##
## [[3]]
## r
## p 1 2 3
## 1 15 1 1
## 2 3 22 1
## 3 1 1 15
##
## [[4]]
## r
## p 1 2 3
## 1 17 1 0
## 2 6 22 0
## 3 0 1 13
##
## [[5]]
## r
## p 1 2 3
## 1 18 3 0
## 2 0 17 1
## 3 0 1 20
## 89 %
Below is the cumulative proportion of variance explained by the principal components.
PC1 alone explains only 39.75% of the variance.
Adding PC2 raises this to 66.96%.
Adding PC3 raises it further to 86.31%.
success <- 0
lgpc_tb <- list()
for(i in 1:5){
train <- gmidm_data[-midmtag[[i]], ]
test <- gmidm_data[midmtag[[i]], ]
Pca <- prcomp(train[,-1], center=TRUE, scale.=TRUE)
ptrain <- Pca[["x"]]
ptrain <- data.frame(gmidm_data$group[-midmtag[[i]]], ptrain)
colnames(ptrain)[1] <- "group"
lg <- multinom(group~PC1+PC2+PC3, data = ptrain)
ptest <- predict(Pca, test[-1])
p <- predict(lg, ptest)
r <- gmidm_data$group[midmtag[[i]]]
lgpc_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## Call:
## multinom(formula = group ~ PC1 + PC2 + PC3, data = ptrain)
##
## Coefficients:
## (Intercept) PC1 PC2 PC3
## 2 -0.2916505 -0.3613469 -3.3651139 1.1716770
## 3 -0.9192512 -3.8961902 -0.6990297 -0.2482348
##
## Std. Errors:
## (Intercept) PC1 PC2 PC3
## 2 0.3916937 0.3347252 0.5172701 0.3449087
## 3 0.4983293 0.6204812 0.4658587 0.4378014
##
## Residual Deviance: 153.1981
## AIC: 169.1981
## [[1]]
## r
## p 1 2 3
## 1 16 3 1
## 2 3 7 0
## 3 1 1 28
##
## [[2]]
## r
## p 1 2 3
## 1 19 1 1
## 2 0 19 1
## 3 1 0 18
##
## [[3]]
## r
## p 1 2 3
## 1 14 1 1
## 2 3 22 1
## 3 2 1 15
##
## [[4]]
## r
## p 1 2 3
## 1 17 1 1
## 2 6 22 0
## 3 0 1 12
##
## [[5]]
## r
## p 1 2 3
## 1 18 3 0
## 2 0 17 2
## 3 0 1 19
## 87.67 %
Including PC3 gives about 1% higher prediction accuracy than leaving it out.
success <- 0
rt_tb <- list()
for(i in 1:5){
train <- gmidm_data[-midmtag[[i]],]
test <- gmidm_data[midmtag[[i]],]
rt <- rpart(group ~ ., data = train)
p <- predict(rt, test) %>% round()
pp <- c()
for(j in 1:dim(p)[1]){
pp[j] <- which(p[j,]==max(p[j,]))
}
p <- pp
r <- group[midmtag[[i]]]
rt_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## [[1]]
## r
## p 1 2 3
## 1 13 1 4
## 2 4 9 2
## 3 3 1 23
##
## [[2]]
## r
## p 1 2 3
## 1 20 0 2
## 2 0 19 3
## 3 0 1 15
##
## [[3]]
## r
## p 1 2 3
## 1 9 0 0
## 2 4 20 1
## 3 6 4 16
##
## [[4]]
## r
## p 1 2 3
## 1 17 5 0
## 2 4 15 0
## 3 2 4 13
##
## [[5]]
## r
## p 1 2 3
## 1 18 7 1
## 2 0 14 1
## 3 0 0 19
## 80 %
success <- 0
rf_tb <- list()
for(i in 1:5){
train <- gmidm_data[-midmtag[[i]],]
test <- gmidm_data[midmtag[[i]],]
train$group <- train$group %>% as.factor()
rf <- randomForest(group~., train)
p <- predict(rf, test)
r <- group[midmtag[[i]]]
rf_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## [[1]]
## r
## p 1 2 3
## 1 16 2 0
## 2 3 8 0
## 3 1 1 29
##
## [[2]]
## r
## p 1 2 3
## 1 20 1 1
## 2 0 18 1
## 3 0 1 18
##
## [[3]]
## r
## p 1 2 3
## 1 16 1 1
## 2 2 20 1
## 3 1 3 15
##
## [[4]]
## r
## p 1 2 3
## 1 17 2 0
## 2 6 20 0
## 3 0 2 13
##
## [[5]]
## r
## p 1 2 3
## 1 18 3 0
## 2 0 17 1
## 3 0 1 20
## 88.33 %
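(The mtry search output below presumably comes from a tuneRF call like the one used in the later sections.)
tuneRF(train[,-1], train[,1])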
## mtry = 2 OOB error = 14.17%
## Searching left ...
## mtry = 1 OOB error = 15%
## -0.05882353 0.05
## Searching right ...
## mtry = 4 OOB error = 16.25%
## -0.1470588 0.05
## mtry OOBError
## 1.OOB 1 0.1500000
## 2.OOB 2 0.1416667
## 4.OOB 4 0.1625000
success <- 0
rf_tb <- list()
for(i in 1:5){
train <- gmidm_data[-midmtag[[i]],]
test <- gmidm_data[midmtag[[i]],]
train$group <- train$group %>% as.factor()
rf <- randomForest(group~., train, ntree=100, mtry=2)
p <- predict(rf, test)
r <- group[midmtag[[i]]]
rf_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## [[1]]
## r
## p 1 2 3
## 1 16 2 0
## 2 3 8 0
## 3 1 1 29
##
## [[2]]
## r
## p 1 2 3
## 1 20 1 1
## 2 0 18 1
## 3 0 1 18
##
## [[3]]
## r
## p 1 2 3
## 1 16 1 1
## 2 2 20 1
## 3 1 3 15
##
## [[4]]
## r
## p 1 2 3
## 1 17 2 0
## 2 6 20 0
## 3 0 2 13
##
## [[5]]
## r
## p 1 2 3
## 1 18 3 0
## 2 0 17 1
## 3 0 1 20
## 88.33 %
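(The Gini importance below is presumably printed with rf$importance, as done explicitly in the later sections.)
rf$importance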
## MeanDecreaseGini
## X1 60.33646
## X2 58.45880
## X3 19.16287
## X4 21.39738
gmidm_data$group <- as.integer(gmidm_data$group)-1
success <- 0
xgb_tb <- list()
for(i in 1:5){
train <- gmidm_data[-midmtag[[i]],]
test <- gmidm_data[midmtag[[i]],]
dtrain <- xgb.DMatrix(data = as.matrix(train[,-1]),
label = train$group)
dtest <- xgb.DMatrix(data = as.matrix(test[,-1]),
label = test$group)
xgb.model <- xgboost(data = dtrain,
objective='multi:softmax',
num_class=3,
print_every_n = 100,
max_depth = 3,
eta = 0.3,
gamma = 0,
subsample = 1,
colsample_bytree = 1,
min_child_weight = 12,
nrounds = 150)
# To plot all the decision trees inside the xgboost model, the function below can be used (there are many, so the plot is skipped here)
# xgb.plot.tree(model = xgb.model)
# Prediction
p <- predict(xgb.model, dtest)+1
r <- gmidm_data$group[midmtag[[i]]]+1
xgb_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## [[1]]
## r
## p 1 2 3
## 1 16 2 1
## 2 3 8 0
## 3 1 1 28
##
## [[2]]
## r
## p 1 2 3
## 1 19 0 1
## 2 1 17 2
## 3 0 3 17
##
## [[3]]
## r
## p 1 2 3
## 1 16 1 1
## 2 2 20 1
## 3 1 3 15
##
## [[4]]
## r
## p 1 2 3
## 1 17 0 0
## 2 6 21 0
## 3 0 3 13
##
## [[5]]
## r
## p 1 2 3
## 1 18 5 0
## 2 0 15 1
## 3 0 1 20
## 86.67 %
success <- 0
lda_tb <- list()
for(i in 1:5){
train <- gmidp_data[-midptag[[i]],]
test <- gmidp_data[midptag[[i]],]
plda <- lda(group~., train)
p<- predict(plda, test[,-1])$class
r <- group[midptag[[i]]]
lda_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## [[1]]
## r
## p 1 2 3 4
## 1 10 3 3 0
## 2 2 9 1 5
## 3 2 0 20 0
## 4 2 5 2 16
##
## [[2]]
## r
## p 1 2 3 4
## 1 15 6 1 1
## 2 2 16 0 1
## 3 1 1 11 0
## 4 2 3 1 19
##
## [[3]]
## r
## p 1 2 3 4
## 1 16 0 1 0
## 2 3 13 3 4
## 3 4 0 15 0
## 4 1 1 1 18
##
## [[4]]
## r
## p 1 2 3 4
## 1 13 1 0 0
## 2 3 14 2 3
## 3 4 0 17 0
## 4 0 7 0 16
##
## [[5]]
## r
## p 1 2 3 4
## 1 10 0 1 0
## 2 2 21 0 4
## 3 4 0 21 0
## 4 4 0 0 13
## 75.75 %
success <- 0
lg_tb <- list()
for(i in 1:5){
train <- gmidp_data[-midptag[[i]],]
test <- gmidp_data[midptag[[i]],]
lg <- multinom(group~., train)
p<- predict(lg, test[,-1])
r <- group[midptag[[i]]]
lg_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## Call:
## multinom(formula = group ~ ., data = train)
##
## Coefficients:
## (Intercept) X1 X2 X3 X4 X5
## 2 -1.152588 -1.0643308 -1.0409963 1.7034721 -1.7457894 0.484605
## 3 7.218486 -0.4645813 -0.5225255 -0.9869686 -0.1287672 1.272399
## 4 -6.828936 0.1667986 -1.8088136 2.5009878 -3.6524196 1.212267
##
## Std. Errors:
## (Intercept) X1 X2 X3 X4 X5
## 2 1.934027 0.2434073 0.2357610 0.3729507 0.2972693 0.2162955
## 3 1.862796 0.2246910 0.2266733 0.2535173 0.2255872 0.2375589
## 4 2.468516 0.2902900 0.3186631 0.4728831 0.4449898 0.2865967
##
## Residual Deviance: 394.2037
## AIC: 430.2037
## [[1]]
## r
## p 1 2 3 4
## 1 10 2 3 1
## 2 2 11 0 6
## 3 2 0 21 0
## 4 2 4 2 14
##
## [[2]]
## r
## p 1 2 3 4
## 1 17 7 1 3
## 2 2 15 0 1
## 3 1 1 11 1
## 4 0 3 1 16
##
## [[3]]
## r
## p 1 2 3 4
## 1 15 0 1 0
## 2 4 13 2 3
## 3 4 0 15 0
## 4 1 1 2 19
##
## [[4]]
## r
## p 1 2 3 4
## 1 14 1 3 1
## 2 2 16 1 3
## 3 4 0 15 0
## 4 0 5 0 15
##
## [[5]]
## r
## p 1 2 3 4
## 1 10 2 1 1
## 2 3 19 0 3
## 3 4 0 21 0
## 4 3 0 0 13
## 75 %
Below is the cumulative proportion of variance explained by the principal components.
PC1 alone explains only 35.29% of the variance.
Adding PC2 raises this to 61.51%.
Adding PC3 raises it further to 78.53%.
success <- 0
lgpc_tb <- list()
for(i in 1:5){
train <- gmidp_data[-midptag[[i]], ]
test <- gmidp_data[midptag[[i]], ]
Pca <- prcomp(train[,-1], center=TRUE, scale.=TRUE)
ptrain <- Pca[["x"]]
ptrain <- data.frame(gmidp_data$group[-midptag[[i]]], ptrain)
colnames(ptrain)[1] <- "group"
lg <- multinom(group~PC1+PC2+PC3, data = ptrain)
ptest <- predict(Pca, test[-1])
p <- predict(lg, ptest)
r <- gmidp_data$group[midptag[[i]]]
lgpc_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## Call:
## multinom(formula = group ~ PC1 + PC2 + PC3, data = ptrain)
##
## Coefficients:
## (Intercept) PC1 PC2 PC3
## 2 -0.1090491 0.4887995 1.2758641 -0.1905764
## 3 -1.1887818 -2.0702048 0.3645244 0.8129034
## 4 -0.9699945 1.3851039 1.6232281 1.7938276
##
## Std. Errors:
## (Intercept) PC1 PC2 PC3
## 2 0.2331447 0.2279549 0.201134 0.2236235
## 3 0.3503022 0.2981167 0.229690 0.2854826
## 4 0.3226705 0.2947342 0.241147 0.3178975
##
## Residual Deviance: 488.4273
## AIC: 512.4273
## [[1]]
## r
## p 1 2 3 4
## 1 10 5 3 1
## 2 3 3 2 6
## 3 2 0 21 0
## 4 1 9 0 14
##
## [[2]]
## r
## p 1 2 3 4
## 1 12 8 1 3
## 2 4 13 0 3
## 3 1 1 12 1
## 4 3 4 0 14
##
## [[3]]
## r
## p 1 2 3 4
## 1 12 3 1 2
## 2 5 7 1 5
## 3 4 1 17 0
## 4 3 3 1 15
##
## [[4]]
## r
## p 1 2 3 4
## 1 10 2 2 3
## 2 3 15 2 4
## 3 6 0 15 0
## 4 1 5 0 12
##
## [[5]]
## r
## p 1 2 3 4
## 1 10 3 0 1
## 2 2 17 0 2
## 3 1 0 22 0
## 4 7 1 0 14
## 66.25 %
success <- 0
rt_tb <- list()
for(i in 1:5){
train <- gmidp_data[-midptag[[i]],]
test <- gmidp_data[midptag[[i]],]
rt <- rpart(group ~ ., data = train)
p <- predict(rt, test) %>% round()
pp <- c()
for(j in 1:dim(p)[1]){
pp[j] <- which(p[j,]==max(p[j,]))
}
p <- pp
r <- group[midptag[[i]]]
rt_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## [[1]]
## r
## p 1 2 3 4
## 1 10 3 5 6
## 2 4 7 2 6
## 3 1 0 19 0
## 4 1 7 0 9
##
## [[2]]
## r
## p 1 2 3 4
## 1 14 5 3 4
## 2 5 14 0 1
## 3 0 1 9 1
## 4 1 6 1 15
##
## [[3]]
## r
## p 1 2 3 4
## 1 11 1 1 4
## 2 5 10 3 1
## 3 4 0 14 2
## 4 4 3 2 15
##
## [[4]]
## r
## p 1 2 3 4
## 1 11 0 3 4
## 2 2 15 1 1
## 3 6 0 15 0
## 4 1 7 0 14
##
## [[5]]
## r
## p 1 2 3 4
## 1 13 4 2 0
## 2 1 17 0 5
## 3 0 0 20 0
## 4 6 0 0 12
## 66 %
success <- 0
rf_tb <- list()
for(i in 1:5){
train <- gmidp_data[-midptag[[i]],]
test <- gmidp_data[midptag[[i]],]
train$group <- train$group %>% as.factor()
rf <- randomForest(group~., train)
p <- predict(rf, test)
r <- group[midptag[[i]]]
rf_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## [[1]]
## r
## p 1 2 3 4
## 1 9 3 3 0
## 2 2 9 0 4
## 3 3 0 22 0
## 4 2 5 1 17
##
## [[2]]
## r
## p 1 2 3 4
## 1 15 3 1 1
## 2 2 19 0 1
## 3 1 1 11 0
## 4 2 3 1 19
##
## [[3]]
## r
## p 1 2 3 4
## 1 15 0 1 0
## 2 3 13 1 4
## 3 4 0 16 0
## 4 2 1 2 18
##
## [[4]]
## r
## p 1 2 3 4
## 1 15 0 1 0
## 2 2 18 1 3
## 3 3 0 17 0
## 4 0 4 0 16
##
## [[5]]
## r
## p 1 2 3 4
## 1 10 0 1 0
## 2 2 21 0 4
## 3 4 0 21 0
## 4 4 0 0 13
## 78.5 %
layout(matrix(c(1,2), nrow=1), widths=c(4,1))
par(mar=c(5,4,4,0)) #No margin on the right side
plot(rf)
par(mar=c(5,0,4,2)) #No margin on the left side
plot(c(0,1),type="n", axes=F, xlab="", ylab="")
legend("top", colnames(rf$err.rate),col=1:4,cex=0.8,fill=1:4)
tuneRF(train[,-1], train[,1])
## mtry = 2 OOB error = 27.19%
## Searching left ...
## mtry = 1 OOB error = 29.69%
## -0.09195402 0.05
## Searching right ...
## mtry = 4 OOB error = 28.44%
## -0.04597701 0.05
## mtry OOBError
## 1.OOB 1 0.296875
## 2.OOB 2 0.271875
## 4.OOB 4 0.284375
success <- 0
rf_tb <- list()
for(i in 1:5){
train <- gmidp_data[-midptag[[i]],]
test <- gmidp_data[midptag[[i]],]
train$group <- train$group %>% as.factor()
rf <- randomForest(group~., train, ntree=100, mtry=2)
p <- predict(rf, test)
r <- group[midptag[[i]]]
rf_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## [[1]]
## r
## p 1 2 3 4
## 1 9 3 3 0
## 2 2 9 0 4
## 3 3 0 22 0
## 4 2 5 1 17
##
## [[2]]
## r
## p 1 2 3 4
## 1 15 3 1 1
## 2 2 19 0 1
## 3 1 1 11 0
## 4 2 3 1 19
##
## [[3]]
## r
## p 1 2 3 4
## 1 15 0 1 0
## 2 3 13 1 4
## 3 4 0 16 0
## 4 2 1 2 18
##
## [[4]]
## r
## p 1 2 3 4
## 1 15 0 1 0
## 2 2 18 1 3
## 3 3 0 17 0
## 4 0 4 0 16
##
## [[5]]
## r
## p 1 2 3 4
## 1 10 0 1 0
## 2 2 21 0 4
## 3 4 0 21 0
## 4 4 0 0 13
## 78.5 %
rf$importance
## MeanDecreaseGini
## X1 40.36386
## X2 32.13683
## X3 47.27555
## X4 63.31641
## X5 56.05916
gmidp_data$group <- as.integer(gmidp_data$group)-1
success <- 0
xgb_tb <- list()
for(i in 1:5){
train <- gmidp_data[-midptag[[i]],]
test <- gmidp_data[midptag[[i]],]
dtrain <- xgb.DMatrix(data = as.matrix(train[,-1]),
label = train$group)
dtest <- xgb.DMatrix(data = as.matrix(test[,-1]),
label = test$group)
xgb.model <- xgboost(data = dtrain,
objective='multi:softmax',
num_class=4,
print_every_n = 100,
max_depth = 1,
eta = 0.3,
gamma = 0,
subsample = 1,
colsample_bytree = 1,
min_child_weight = 12,
nrounds = 150)
# To plot all the decision trees inside the xgboost model, the function below can be used (there are many, so the plot is skipped here)
# xgb.plot.tree(model = xgb.model)
# Prediction
p <- predict(xgb.model, dtest)+1
r <- gmidp_data$group[midptag[[i]]]+1
xgb_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## [[1]]
## r
## p 1 2 3 4
## 1 10 3 4 1
## 2 2 9 0 9
## 3 3 0 21 0
## 4 1 5 1 11
##
## [[2]]
## r
## p 1 2 3 4
## 1 16 7 5 1
## 2 2 15 1 1
## 3 1 1 7 1
## 4 1 3 0 18
##
## [[3]]
## r
## p 1 2 3 4
## 1 14 0 2 1
## 2 6 12 2 0
## 3 3 0 15 0
## 4 1 2 1 21
##
## [[4]]
## r
## p 1 2 3 4
## 1 11 1 2 1
## 2 1 17 2 2
## 3 7 1 15 1
## 4 1 3 0 15
##
## [[5]]
## r
## p 1 2 3 4
## 1 9 0 1 1
## 2 2 21 0 1
## 3 5 0 21 1
## 4 4 0 0 14
## 73 %
success <- 0
lda_tb <- list()
for(i in 1:5){
train <- ghard_data[-hardtag[[i]],]
test <- ghard_data[hardtag[[i]],]
plda <- lda(group~., train)
p<- predict(plda, test[,-1])$class
r <- group[hardtag[[i]]]
lda_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## [[1]]
## r
## p 1 2 3 4
## 1 9 1 1 0
## 2 6 13 0 4
## 3 1 2 25 0
## 4 0 1 0 17
##
## [[2]]
## r
## p 1 2 3 4
## 1 13 10 1 0
## 2 3 10 0 4
## 3 2 0 12 0
## 4 2 6 0 17
##
## [[3]]
## r
## p 1 2 3 4
## 1 13 2 0 0
## 2 4 11 0 3
## 3 4 0 20 0
## 4 3 1 0 19
##
## [[4]]
## r
## p 1 2 3 4
## 1 11 4 4 0
## 2 6 11 0 3
## 3 2 0 15 0
## 4 1 7 0 16
##
## [[5]]
## r
## p 1 2 3 4
## 1 11 4 4 0
## 2 5 8 0 0
## 3 2 1 18 0
## 4 2 8 0 17
## 71.5 %
success <- 0
lg_tb <- list()
for(i in 1:5){
train <- ghard_data[-hardtag[[i]],]
test <- ghard_data[hardtag[[i]],]
lg <- multinom(group~., train)
p<- predict(lg, test[,-1])
r <- group[hardtag[[i]]]
lg_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## Call:
## multinom(formula = group ~ ., data = train)
##
## Coefficients:
## (Intercept) X1 X2 X3 X4 X5
## 2 -6.732855 0.5353301 -1.851090 1.704793 -0.3371828 0.3845232
## 3 15.979502 -0.2248725 2.324965 -4.658457 1.3145202 0.8509714
## 4 -24.419680 2.4504441 -5.384352 5.057537 -2.8939583 2.0507335
##
## Std. Errors:
## (Intercept) X1 X2 X3 X4 X5
## 2 1.643493 0.2069970 0.3141770 0.3322866 0.2141052 0.2389116
## 3 3.258931 0.3565227 0.7011019 0.9105876 0.4043250 0.3930075
## 4 3.742443 0.3864056 0.7210244 0.7145816 0.4869484 0.4364483
##
## Residual Deviance: 365.196
## AIC: 401.196
## [[1]]
## r
## p 1 2 3 4
## 1 9 2 1 0
## 2 6 12 0 4
## 3 1 2 25 0
## 4 0 1 0 17
##
## [[2]]
## r
## p 1 2 3 4
## 1 13 11 1 0
## 2 4 10 0 4
## 3 2 0 12 0
## 4 1 5 0 17
##
## [[3]]
## r
## p 1 2 3 4
## 1 13 1 0 0
## 2 4 11 0 3
## 3 4 0 20 0
## 4 3 2 0 19
##
## [[4]]
## r
## p 1 2 3 4
## 1 14 6 4 0
## 2 5 9 0 4
## 3 1 0 15 0
## 4 0 7 0 15
##
## [[5]]
## r
## p 1 2 3 4
## 1 11 6 4 0
## 2 6 9 0 1
## 3 2 0 18 0
## 4 1 6 0 16
## 71.25 %
Below is the cumulative proportion of variance explained by the principal components.
PC1 alone explains only 36.20% of the variance.
Adding PC2 raises this to 64.17%.
Adding PC3 raises it further to 82.13%.
success <- 0
lgpc_tb <- list()
for(i in 1:5){
train <- ghard_data[-hardtag[[i]], ]
test <- ghard_data[hardtag[[i]], ]
Pca <- prcomp(train[,-1], center=TRUE, scale.=TRUE)
ptrain <- Pca[["x"]]
ptrain <- data.frame(ghard_data$group[-hardtag[[i]]], ptrain)
colnames(ptrain)[1] <- "group"
lg <- multinom(group~PC1+PC2+PC3, data = ptrain)
ptest <- predict(Pca, test[-1])
p <- predict(lg, ptest)
r <- ghard_data$group[hardtag[[i]]]
lgpc_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## Call:
## multinom(formula = group ~ PC1 + PC2 + PC3, data = ptrain)
##
## Coefficients:
## (Intercept) PC1 PC2 PC3
## 2 0.0213224 -0.1896045 -0.04104361 -0.06122511
## 3 -1.0211892 -1.7969429 -0.69353177 1.92821923
## 4 -0.0963046 -0.0516834 0.64206426 1.01283193
##
## Std. Errors:
## (Intercept) PC1 PC2 PC3
## 2 0.1898186 0.1449818 0.1473594 0.1832821
## 3 0.2953453 0.2555225 0.2257196 0.3140569
## 4 0.2024693 0.1530437 0.1720265 0.2156909
##
## Residual Deviance: 647.8498
## AIC: 671.8498
## [[1]]
## r
## p 1 2 3 4
## 1 6 8 0 3
## 2 5 3 2 2
## 3 4 4 22 5
## 4 1 2 2 11
##
## [[2]]
## r
## p 1 2 3 4
## 1 7 13 0 3
## 2 1 5 2 1
## 3 2 3 11 4
## 4 10 5 0 13
##
## [[3]]
## r
## p 1 2 3 4
## 1 6 3 0 1
## 2 12 8 1 4
## 3 3 2 15 2
## 4 3 1 4 15
##
## [[4]]
## r
## p 1 2 3 4
## 1 9 7 3 4
## 2 4 5 1 2
## 3 0 4 13 0
## 4 7 6 2 13
##
## [[5]]
## r
## p 1 2 3 4
## 1 5 7 0 2
## 2 11 7 4 1
## 3 1 2 15 2
## 4 3 5 3 12
## 50.25 %
success <- 0
rt_tb <- list()
for(i in 1:5){
train <- ghard_data[-hardtag[[i]],]
test <- ghard_data[hardtag[[i]],]
rt <- rpart(group ~ ., data = train)
p <- predict(rt, test) %>% round()
pp <- c()
for(j in 1:dim(p)[1]){
pp[j] <- which(p[j,]==max(p[j,]))
}
p <- pp
r <- group[hardtag[[i]]]
rt_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## [[1]]
## r
## p 1 2 3 4
## 1 11 8 2 6
## 2 3 6 1 4
## 3 2 1 23 1
## 4 0 2 0 10
##
## [[2]]
## r
## p 1 2 3 4
## 1 13 10 2 6
## 2 1 8 0 4
## 3 2 5 11 1
## 4 4 3 0 10
##
## [[3]]
## r
## p 1 2 3 4
## 1 10 4 1 1
## 2 8 8 1 10
## 3 4 2 18 0
## 4 2 0 0 11
##
## [[4]]
## r
## p 1 2 3 4
## 1 11 8 2 5
## 2 4 7 1 3
## 3 2 1 16 0
## 4 3 6 0 11
##
## [[5]]
## r
## p 1 2 3 4
## 1 9 3 2 3
## 2 7 4 2 1
## 3 1 1 15 1
## 4 3 13 3 12
## 56 %
success <- 0
rf_tb <- list()
for(i in 1:5){
train <- ghard_data[-hardtag[[i]],]
test <- ghard_data[hardtag[[i]],]
train$group <- train$group %>% as.factor()
rf <- randomForest(group~., train)
p <- predict(rf, test)
r <- group[hardtag[[i]]]
rf_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## [[1]]
## r
## p 1 2 3 4
## 1 10 4 1 0
## 2 5 12 0 3
## 3 1 0 25 0
## 4 0 1 0 18
##
## [[2]]
## r
## p 1 2 3 4
## 1 13 6 1 0
## 2 4 14 0 3
## 3 1 0 12 0
## 4 2 6 0 18
##
## [[3]]
## r
## p 1 2 3 4
## 1 13 2 0 0
## 2 4 10 0 1
## 3 4 0 20 0
## 4 3 2 0 21
##
## [[4]]
## r
## p 1 2 3 4
## 1 12 4 4 0
## 2 6 11 0 2
## 3 1 0 15 0
## 4 1 7 0 17
##
## [[5]]
## r
## p 1 2 3 4
## 1 11 4 4 0
## 2 5 8 0 0
## 3 2 1 18 0
## 4 2 8 0 17
## 73.75 %
layout(matrix(c(1,2), nrow=1), widths=c(4,1))
par(mar=c(5,4,4,0)) #No margin on the right side
plot(rf)
par(mar=c(5,0,4,2)) #No margin on the left side
plot(c(0,1),type="n", axes=F, xlab="", ylab="")
legend("top", colnames(rf$err.rate),col=1:4,cex=0.8,fill=1:4)
tuneRF(train[,-1], train[,1])
## mtry = 2 OOB error = 33.44%
## Searching left ...
## mtry = 1 OOB error = 38.44%
## -0.1495327 0.05
## Searching right ...
## mtry = 4 OOB error = 37.5%
## -0.1214953 0.05
## mtry OOBError
## 1.OOB 1 0.384375
## 2.OOB 2 0.334375
## 4.OOB 4 0.375000
success <- 0
rf_tb <- list()
for(i in 1:5){
train <- ghard_data[-hardtag[[i]],]
test <- ghard_data[hardtag[[i]],]
train$group <- train$group %>% as.factor()
rf <- randomForest(group~., train, ntree=150, mtry=2)
p <- predict(rf, test)
r <- group[hardtag[[i]]]
rf_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## [[1]]
## r
## p 1 2 3 4
## 1 10 4 1 0
## 2 5 12 0 3
## 3 1 0 25 0
## 4 0 1 0 18
##
## [[2]]
## r
## p 1 2 3 4
## 1 13 6 1 0
## 2 4 14 0 3
## 3 1 0 12 0
## 4 2 6 0 18
##
## [[3]]
## r
## p 1 2 3 4
## 1 13 2 0 0
## 2 4 10 0 1
## 3 4 0 20 0
## 4 3 2 0 21
##
## [[4]]
## r
## p 1 2 3 4
## 1 12 4 4 0
## 2 6 11 0 2
## 3 1 0 15 0
## 4 1 7 0 17
##
## [[5]]
## r
## p 1 2 3 4
## 1 11 4 4 0
## 2 5 8 0 0
## 3 2 1 18 0
## 4 2 8 0 17
## 73.75 %
rf$importance
## MeanDecreaseGini
## X1 39.32964
## X2 45.70967
## X3 76.47191
## X4 41.88933
## X5 35.88388
ghard_data$group <- as.integer(ghard_data$group)-1
success <- 0
xgb_tb <- list()
for(i in 1:5){
train <- ghard_data[-hardtag[[i]],]
test <- ghard_data[hardtag[[i]],]
dtrain <- xgb.DMatrix(data = as.matrix(train[,-1]),
label = train$group)
dtest <- xgb.DMatrix(data = as.matrix(test[,-1]),
label = test$group)
xgb.model <- xgboost(data = dtrain,
objective='multi:softmax',
num_class=4,
print_every_n = 100,
max_depth = 2,
eta = 0.3,
gamma = 0,
subsample = 1,
colsample_bytree = 1,
min_child_weight = 12,
nrounds = 150)
# To plot all the decision trees inside the xgboost model, the function below can be used (there are many, so the plot is skipped here)
# xgb.plot.tree(model = xgb.model)
# Prediction
p <- predict(xgb.model, dtest)+1
r <- ghard_data$group[hardtag[[i]]]+1
xgb_tb[[i]] <- table(p, r)
success <- success + sum(p==r)
}
## [[1]]
## r
## p 1 2 3 4
## 1 13 2 3 0
## 2 0 12 0 5
## 3 3 1 22 1
## 4 0 2 1 15
##
## [[2]]
## r
## p 1 2 3 4
## 1 12 11 0 2
## 2 1 10 0 7
## 3 2 2 12 0
## 4 5 3 1 12
##
## [[3]]
## r
## p 1 2 3 4
## 1 14 2 1 1
## 2 3 7 0 3
## 3 3 1 19 0
## 4 4 4 0 18
##
## [[4]]
## r
## p 1 2 3 4
## 1 13 5 2 2
## 2 5 14 1 3
## 3 1 0 16 0
## 4 1 3 0 14
##
## [[5]]
## r
## p 1 2 3 4
## 1 12 7 6 3
## 2 4 7 2 2
## 3 1 1 14 0
## 4 3 6 0 12
## 67 %
The table below shows the prediction accuracy for each difficulty level; as expected, accuracy drops as the data get harder.
For this multivariate normal data, though, the relatively simple models perform no worse than the more complex ones.
LDA and logistic regression perform consistently well.
Random forest is the best performer on every level except level 2.
CART performs worst: a single tree struggles to separate the more complex data at the later levels.
| | LDA | LR | LR(PCA) | CART | RF | XGB |
|---|---|---|---|---|---|---|
| level 1 | 99 % | 98 % | 99.33 % | 97.33 % | 99.33 % | 99 % |
| level 2 | 90 % | 89 % | 87.67 % | 80 % | 88.33 % | 86.67 % |
| level 3 | 75.75 % | 75 % | 66.25 % | 66 % | 78.5 % | 73 % |
| level 4 | 71.5 % | 71.25 % | 50.25 % | 56 % | 73.75 % | 67 % |
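Since knitr is already loaded, a table like the one above can be built with kable; a minimal sketch, with the accuracy values collected from the runs above:
acc <- data.frame(
  LDA       = c(99, 90, 75.75, 71.5),
  LR        = c(98, 89, 75, 71.25),
  `LR(PCA)` = c(99.33, 87.67, 66.25, 50.25),
  CART      = c(97.33, 80, 66, 56),
  RF        = c(99.33, 88.33, 78.5, 73.75),
  XGB       = c(99, 86.67, 73, 67),
  row.names = paste("level", 1:4),
  check.names = FALSE
)
kable(acc)  # renders the accuracy comparison table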
https://hans0803.shinyapps.io/Multivariate_normal_Data_Creater/ (data generator with fixed parameter values)
https://hans0803.shinyapps.io/Multinormal_Data_Builder/ (random data generator)
https://bookdown.org/yihui/rmarkdown-cookbook/ (Rmarkdown cookbook)
https://bookdown.org/xiao/RAnalysisBook/ (markdown book)